suppressPackageStartupMessages({
library("EnhancedVolcano")
library("limma")
library("biomaRt")
library("annotables")
library("tximport")
library("apeglm")
library("eulerr")
library("DESeq2")
library("HGNChelper")
library("tictoc")
library("DESeq2")
library("kableExtra")
library("beeswarm")
library("missMethyl")
library("gridExtra")
library("png")
library("metafor")
library("ggplot2")
library("purrr")
library("metafor")
library("AnnotationDbi")
library("readxl")
library("ggplot2")
library("tidyverse")
library("magrittr")
library("readr")
library("eulerr")
library("RColorBrewer")
library("e1071")
library("plotly")
library("pheatmap")
library("msigdbr")
library("mitch")
library("rtracklayer")
library("dplyr")
library("org.Mm.eg.db")
library("fgsea")
})
CORES=16
base<- "/mnt/vol1/Mouse_model_RNA_Seq/kallisto_results_plus_ncRNA/"
idx<- "/mnt/vol1/Mouse_model_RNA_Seq/index"
samples<- read_tsv(file.path(base,"samples.txt"),col_names = "sample", show_col_types = FALSE)
samples$path <- file.path(base, samples$sample, "abundance.tsv")
align<- read_delim("/mnt/vol1/Mouse_model_RNA_Seq/kallisto_results_plus_ncRNA/alignment_summary.tsv")
old_align<-read_csv("/mnt/vol1/Mouse_model_RNA_Seq/kallisto_results/aligment_no_ncrna.csv", show_col_types = FALSE)
new_df<- align%>% mutate(index = "cDNA + ncRNA", rate = 100 * pseudoaligned/processed) %>%
select(sample, index, rate)
old_df <- old_align %>% mutate(index = "cDNA only", rate = 100 * pseudoaligned/processed) %>%
select(sample, index, rate)
df<- bind_rows(new_df, old_df)
delta<- df %>% pivot_wider(names_from = index, values_from = rate) %>%
mutate(delta = `cDNA + ncRNA` - `cDNA only`) %>%
arrange(delta)
df$sample <- factor(df$sample, levels = delta$sample)
ggplot(df, aes(sample, rate, fill = index)) +
geom_col(position = position_dodge(width = 0.75), width = 0.7) +
labs(title = "Pseudoalignment rate by sample",
x = "Sample", y = "Rate (%)", fill = "Run") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
tx2gene<- read.csv(file.path("/mnt/vol1/Mouse_model_RNA_Seq/tx2gene_r115.csv"), stringsAsFactors = FALSE)
stopifnot(all(c("TXNAME","GENEID") %in% names(tx2gene)))
files<- file.path(base, samples$sample, "abundance.tsv")
names(files) <- samples$sample
stopifnot(all(file.exists(files)))
txi<- tximport(files,
type = "kallisto",
tx2gene = tx2gene[, c("TXNAME","GENEID")],
countsFromAbundance = "lengthScaledTPM",
ignoreTxVersion = TRUE)
txi$counts[1:2, 1:6]
## BB329 BB330 BB331 BB332 BB334 BB335
## ENSMUSG00000000001 3423.529 4452.743 3423.237 3360.252 4088.225 2816.038
## ENSMUSG00000000003 0.000 0.000 0.000 0.000 0.000 0.000
pheno<- read_excel("/mnt/vol1/Mouse_model_RNA_Seq/minipump.xlsx",skip =1 )
colnames(pheno)
## [1] "Sample" "Sex"
## [3] "Intervention" "RNA concentration\r\nng/ul"
## [5] "RIN" "Well Number"
coldata<- pheno %>%
filter(Sample %in% samples$sample) %>%
arrange(match(Sample, samples$sample)) %>%
mutate(
Sex = factor(Sex, levels = c("F","M")), # F as baseline
Treatment = factor(Intervention, levels = c("Saline","Ang-II"))) # saline as baseline
rownames(coldata)<- coldata$Sample
dds<- DESeqDataSetFromTximport(txi, colData = coldata, design = ~ Sex + Treatment + Sex:Treatment)
keep<- rowSums(counts(dds)) >= 10
dds<- dds[keep, ]
dds<- estimateSizeFactors(dds)
norm_counts<- counts(dds, normalized = TRUE)
glimpse(norm_counts)
## num [1:41594, 1:17] 3210.9 157.5 1444.1 196.9 10.3 ...
## - attr(*, "dimnames")=List of 2
## ..$ : chr [1:41594] "ENSMUSG00000000001" "ENSMUSG00000000028" "ENSMUSG00000000031" "ENSMUSG00000000037" ...
## ..$ : chr [1:17] "BB329" "BB330" "BB331" "BB332" ...
vsd<- vst(dds, blind = TRUE)
ann<- as.data.frame(colData(dds))[, c("Sex","Treatment"), drop = FALSE]
#sample–sample correlation heatmap
pheatmap(cor(assay(vsd)),
annotation_col = ann,
annotation_row = ann)
# PCA (colour = Treatment, shape = sex)
plotPCA(vsd, intgroup = c("Treatment","Sex"))
## Fit model (set baselines)
dds$Sex<- relevel(factor(dds$Sex), "F")
dds$Treatment<- relevel(factor(dds$Treatment), "Saline")
design(dds)<- ~ Sex + Treatment + Sex:Treatment
dds<- DESeq(dds)
resultsNames(dds)
## [1] "Intercept" "Sex_M_vs_F"
## [3] "Treatment_Ang.II_vs_Saline" "SexM.TreatmentAng.II"
# baseline (saline): M vs F
coef_sex<- "Sex_M_vs_F"
# Ang-II vs Saline in females (F is baseline sex)
coef_trt<- "Treatment_Ang.II_vs_Saline"
# (Ang-II effect in M) & (Ang-II effect in F)
coef_int<- "SexM.TreatmentAng.II"
### RAW results ###
# 1. Within each Sex
# 1.1 Female: Ang-II vs Saline
res_F_raw<- results(dds, name = coef_trt)
# 1.2 Male: Ang-II vs Saline
res_M_raw<- results(dds, contrast = list(c(coef_trt, coef_int)))
# 2. Effect of Ang-II treatment between sex
res_int_raw<- results(dds, name = coef_int)
# 3. Baseline (saline): M vs F
res_bsl_raw<- results(dds, name = coef_sex)
# Shrinkage results
res_F_shr<- lfcShrink(dds, coef = coef_trt, type = "ashr")
res_M_shr<- lfcShrink(dds, contrast= list(c(coef_trt, coef_int)), type = "ashr")
res_int_shr<- lfcShrink(dds, coef = coef_int, type = "ashr")
res_bsl_shr<- lfcShrink(dds, coef = coef_sex, type = "ashr")
make_df<- function(raw, shr, label){
data.frame(
gene = rownames(raw),
lfc_raw= raw$log2FoldChange,
lfc_shr = shr$log2FoldChange,
FDR_raw = raw$padj,
contrast = label,
stringsAsFactors = FALSE)
}
res_all<- bind_rows(
make_df(res_F_raw, res_F_shr, "Female: Ang-II vs Saline"),
make_df(res_M_raw, res_M_shr, "Male: Ang-II vs Saline"),
make_df(res_int_raw, res_int_shr,"Interaction: (M & F) treatment effect"),
make_df(res_bsl_raw, res_bsl_shr, "Baseline (Saline): M vs F")
)
# Raw vs Shrunken
res_long<- res_all %>%
mutate(
hit_raw= !is.na(FDR_raw) & FDR_raw<= 0.05 & abs(lfc_raw)>= 0.5,
hit_shr= !is.na(FDR_raw) & FDR_raw<= 0.05 & abs(lfc_shr)>= 0.5
) %>%
dplyr::select(gene, contrast, FDR_raw, lfc_raw, lfc_shr, hit_raw, hit_shr) %>%
tidyr::pivot_longer(
cols = c(lfc_raw, lfc_shr, hit_raw, hit_shr),
names_to= c(".value", "lfc_type"),
names_sep= "_"
) %>%
mutate(
lfc_type = dplyr::recode(lfc_type,
raw = "Raw LFC",
shr = "Shrunken LFC"),
lfc_type = factor(lfc_type, levels = c("Raw LFC", "Shrunken LFC")),
contrast = factor(
contrast,
levels = c(
"Female: Ang-II vs Saline",
"Male: Ang-II vs Saline",
"Interaction: (M & F) treatment effect",
"Baseline (Saline): M vs F"
)
)
)
x_max<- quantile(abs(res_long$lfc), 0.995, na.rm = TRUE)
x_lim<- c(-max(1.5, x_max), max(1.5, x_max))
y_max<- quantile(-log10(res_long$FDR_raw), 0.995, na.rm = TRUE)
y_lim<- c(0, max(5, y_max))
ggplot(res_long, aes(lfc, -log10(FDR_raw), colour = hit)) +
geom_point(alpha = 0.6, size = 1.2) +
geom_vline(xintercept = c(-0.5, 0.5), linetype = 2) +
geom_hline(yintercept = -log10(0.05), linetype = 2) +
coord_cartesian(xlim = x_lim, ylim = y_lim) +
facet_grid(contrast ~ lfc_type, switch = "y") +
scale_colour_manual(values = c("FALSE" = "grey60", "TRUE" = "steelblue4"),
labels = c("FALSE" = "FDR>0.05 or LFC<0.5", "TRUE" = "FDR<=0.05 & LFC>=0.5"),name = NULL) +
labs(
title = "Volcano comparison: Raw vs Shrunken (ashr) with LFC filtering",
x = "log2 fold change",
y = expression(-log[10]("FDR from unshrunken"))) +
theme_minimal(base_size = 11) +
theme(
legend.position = "bottom",
strip.placement = "outside",
strip.background = element_blank(),
strip.text.x = element_text(face = "bold"),
strip.text.y.left = element_text(angle = 0, face = "bold"),
panel.grid.minor = element_blank())
## Without LFC filtering
# Raw vs Shrunken
res_long_nolfc<- res_all %>%
mutate(
sig_raw = !is.na(FDR_raw) & FDR_raw <= 0.05,
sig_shr = !is.na(FDR_raw) & FDR_raw <= 0.05
) %>%
dplyr::select(gene, contrast, FDR_raw, lfc_raw, lfc_shr, sig_raw, sig_shr) %>%
tidyr::pivot_longer(
cols= c(lfc_raw, lfc_shr, sig_raw, sig_shr),
names_to= c(".value", "lfc_type"),
names_sep= "_") %>%
mutate(
lfc_type = recode(lfc_type,
raw = "Raw LFC (before)",
shr = "Shrunken LFC (after)"),
lfc_type = factor(lfc_type, levels = c("Raw LFC (before)", "Shrunken LFC (after)")),
contrast= factor(contrast, levels = c(
"Female: Ang-II vs Saline",
"Male: Ang-II vs Saline",
"Interaction: (M & F) treatment effect",
"Baseline (Saline): M vs F"
)))
x_max<- quantile(abs(res_long_nolfc$lfc), 0.995, na.rm = TRUE)
x_lim<- c(-max(1.5, x_max), max(1.5, x_max))
y_max<- quantile(-log10(res_long_nolfc$FDR_raw), 0.995, na.rm = TRUE)
y_lim<- c(0, max(5, y_max))
ggplot(res_long_nolfc, aes(lfc, -log10(FDR_raw), colour = sig)) +
geom_point(alpha = 0.6, size = 1.2) +
geom_hline(yintercept = -log10(0.05), linetype = 2) +
coord_cartesian(xlim = x_lim, ylim = y_lim) +
facet_grid(contrast ~ lfc_type, switch = "y") +
scale_colour_manual(
values = c("FALSE" = "grey60", "TRUE" = "steelblue4"),
labels = c("FALSE" = "FDR > 0.05", "TRUE" = "FDR ≤ 0.05"),
name = NULL) +
labs(
title = "Volcano comparison: Raw vs Shrunken (ashr) without LFC filtering",
x = "log2 fold change",
y = expression(-log[10]("FDR (from unshrunken fit)"))) +
theme_minimal(base_size = 11) +
theme(
legend.position = "bottom",
strip.placement = "outside",
strip.background = element_blank(),
strip.text.x = element_text(face = "bold"),
strip.text.y.left = element_text(angle = 0, face = "bold"),
panel.grid.minor = element_blank())
The interaction coefficient (SexM.TreatmentAng.II) is a difference-in-differences. If β_int ~ 0 for most genes, it means the Ang-II effect is about the same in males and females. Therefore, it could be assume that there can be still many DE genes within each sex but with small effects, however, the extra difference between males and females could almost be negligible for Ang-II.
Reduced model The LRT is comparing the full model to the reduced model to identify significant genes. The p-values are determined solely by the difference in deviance between the ‘full’ and ‘reduced’ model formula (not log2 fold changes). Essentially the LRT test is testing whether the term(s) removed in the ‘reduced’ model explains a significant amount of variation in the data. Quoted https://hbctraining.github.io/DGE_workshop/lessons/08_DGE_LRT.html
Context:
So in the case, where females are baseline/ reference; * β_int > 0: Ang-II effect is larger in males than females.
Results of the preliminary states that;
Group sizes: F (4 Ang-II, 3 Saline), M (4 Ang-II, 6 Saline) = unbalanced and small
F vs M LFC scatter: global Pearson r ≈ 0.056 is small but could be because ~95% of genes are near 0, hence noise dominates the correlation.
Summary of the full model (with interaction) to the reduced model (without interaction) gene-by-gene, shows ~0–4 genes at FDR <0.1 (and 0–1 at FDR < 0.05) out of 41594 genes. Hence it could be deduced that the interaction term adds almost no explanatory power overall.
Effect-size difference (M & F) distribution of median −0.023, centred near 0, further proving that Ang-II effect size is similar in males and females overall.
The histogram of LFC shows that the values of the res_F_raw and res_M_raw is centred around zero(median −0.023), indicating that Ang-II effects are broadly similar between sexes, consistent with the likelihood-ratio test showing negligible evidence for an interaction
# No.of saline and treatment samples of each sex
table(pheno$Sex, pheno$Intervention)
##
## Ang-II Saline
## F 4 3
## M 4 6
# correlation of the interaction
summary(res_F_raw$lfcSE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.05636 0.17710 0.40386 0.67548 0.90210 4.80267
summary(res_M_raw$lfcSE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04774 0.15084 0.35101 0.58459 0.78546 4.05917
cor.test(res_F_raw$log2FoldChange, res_M_raw$log2FoldChange, use="complete.obs")
##
## Pearson's product-moment correlation
##
## data: res_F_raw$log2FoldChange and res_M_raw$log2FoldChange
## t = 11.464, df = 41592, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.04653857 0.06569855
## sample estimates:
## cor
## 0.05612373
lfc_F<- res_F_raw$log2FoldChange
lfc_M<- res_M_raw$log2FoldChange
delta<- lfc_M - lfc_F
summary(delta)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -30.00000 -0.32066 -0.02349 -0.03099 0.33085 21.20516
# Model without interaction term (Hypothesis testing: Likelihood ratio test (LRT))
dds_lrt<- DESeq(dds, test = "LRT", reduced = ~ Sex + Treatment)
lrt<- results(dds_lrt)
summary(lrt)
##
## out of 41594 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 3, 0.0072%
## LFC < 0 (down) : 1, 0.0024%
## outliers [1] : 21, 0.05%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
lrt05<- results(dds_lrt, alpha = 0.05)
summary(lrt05)
##
## out of 41594 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 0, 0%
## LFC < 0 (down) : 1, 0.0024%
## outliers [1] : 21, 0.05%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Distribution of M & F interaction LFC
hist(res_int_raw$log2FoldChange, breaks=80)
Interaction heatmap shows the “top 40 by smallest FDR”
top_heatmap<- function(res_shr, vsd, label, n=40) {
keep<- order(res_shr$padj, na.last = NA)[1:min(n, sum(!is.na(res_shr$padj)))]
genes<- rownames(res_shr)[keep]
mat<- assay(vsd)[genes, , drop=FALSE]
mat<- scale(t(scale(t(mat))))
ann<- as.data.frame(colData(vsd))[, c("Sex","Treatment"), drop=FALSE]
pheatmap(mat, annotation_col = ann, show_rownames = TRUE,
main = paste0("Top ", n, " DE genes: ", label))
}
top_heatmap(res_F_shr, vsd, "F Ang-II vs Saline")
top_heatmap(res_M_shr, vsd, "M Ang-II vs Saline")
top_heatmap(res_int_shr, vsd, "Interaction (M & F)")
top_heatmap(res_bsl_shr, vsd, "Baseline (M vs F, Saline)")
# Female-only view of female contrast genes
female_cols<- colData(vsd)$Sex == "F"
top_heatmap(res_F_shr, vsd[, female_cols], "F Ang-II vs Saline (female samples)")
# Male-only view of male contrast genes
male_cols<- colData(vsd)$Sex == "M"
top_heatmap(res_M_shr, vsd[, male_cols], "M Ang-II vs Saline (male samples)")
top_heatmap_sig<- function(res_shr, vsd, label, n=40, alpha=0.05){
sig<- which(!is.na(res_shr$padj) & res_shr$padj <= alpha)
if (length(sig) == 0) {
message("No genes pass FDR <= ", alpha, " for: ", label)
return(invisible(NULL))
}
keep<- head(sig[order(res_shr$padj[sig])], n)
genes<- rownames(res_shr)[keep]
mat<- assay(vsd)[genes, , drop=FALSE]
mat<- scale(t(scale(t(mat))))
ann<- as.data.frame(colData(vsd))[, c("Sex","Treatment"), drop=FALSE]
pheatmap(mat, annotation_col=ann, show_rownames=TRUE,
main=sprintf("Top %d genes (FDR <= %.2g): %s", length(genes), alpha, label))
}
top_heatmap_sig(res_F_shr, vsd, "F Ang-II vs Saline")
top_heatmap_sig(res_M_shr, vsd, "M Ang-II vs Saline")
top_heatmap_sig(res_bsl_shr, vsd, "Baseline (M vs F, Saline)")
sum(res_int_shr$padj <= 0.05, na.rm=TRUE)
## [1] 1
# Female-only view of female contrast genes
female_cols<- colData(vsd)$Sex == "F"
top_heatmap_sig(res_F_shr, vsd[, female_cols], "F Ang-II vs Saline (female samples)")
# Male-only view of male contrast genes
male_cols<- colData(vsd)$Sex == "M"
top_heatmap_sig(res_M_shr, vsd[, male_cols], "M Ang-II vs Saline (male samples)")
MSigDB “Hallmarks” (collection H) = 50 core biological with minimal overlap gene sets that capture broad, well-defined biological processes.
# Map Ensembl to gene symbols
gtf<- file.path(idx, "Mus_musculus.GRCm39.115.gtf")
gtf_df<- as.data.frame(rtracklayer::import(gtf))
# For symbols
ann<- gtf_df %>%
filter(type == "gene") %>%
transmute(
ensgene = sub("\\.\\d+$", "", gene_id),
symbol = gene_name) %>%
distinct()
head(ann)
## ensgene symbol
## 1 ENSMUSG00000104478 Gm38212
## 2 ENSMUSG00000104385 Gm7449
## 3 ENSMUSG00000101231 Gm28283
## 4 ENSMUSG00000101097 Gm6679
## 5 ENSMUSG00000100764 Gm29155
## 6 ENSMUSG00000100831 Gm17847
# map ensembl to entrez
map_ensg_to_entrez<- function(ens_ids){
ens_ids <- sub("\\.\\d+$","", ens_ids)
tibble(ENSEMBL = ens_ids) %>%
mutate(ENTREZID = AnnotationDbi::mapIds(
org.Mm.eg.db, keys = ENSEMBL, keytype = "ENSEMBL",
column = "ENTREZID", multiVals = "first")) %>%
filter(!is.na(ENTREZID)) %>% distinct()
}
# rank vectors on entrez
rankify_entrez<- function(res_raw, ann_map){
df<- as.data.frame(res_raw) %>%
rownames_to_column("ensgene") %>%
mutate(ensgene = sub("\\.\\d+$","", ensgene)) %>%
left_join(ann_map, by = c("ensgene"="ENSEMBL")) %>%
filter(!is.na(stat), !is.na(ENTREZID))
split(df$stat, df$ENTREZID) %>% vapply(\(v) v[which.max(abs(v))], numeric(1))
}
ens_all<- unique(c(rownames(res_F_raw), rownames(res_M_raw),
rownames(res_int_raw), rownames(res_bsl_raw)))
ann_entrez<- map_ensg_to_entrez(ens_all)
ranks_F_e<- rankify_entrez(res_F_raw, ann_entrez)
ranks_M_e<- rankify_entrez(res_M_raw, ann_entrez)
ranks_Int_e<- rankify_entrez(res_int_raw, ann_entrez)
ranks_Bas_e<- rankify_entrez(res_bsl_raw, ann_entrez)
# Hallmark pathways grouped by gs_name
msig_entrez<- msigdbr(species="Mus musculus", category="H") %>%
distinct(gs_name, entrez_gene) %>%
group_by(gs_name) %>%
summarise(genes = list(unique(entrez_gene)), .groups = "drop") %>%
tibble::deframe()
# overlap
ov_Fe<- sapply(msig_entrez, function(g) sum(names(ranks_F_e) %in% g)); summary(ov_Fe)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 32.00 95.75 164.50 142.28 195.75 202.00
# GSEA
run_gsea<- function(ranks, label, pathways) {
fgsea::fgseaMultilevel(pathways = pathways, stats = ranks, minSize = 10, maxSize = 2000) %>% arrange(padj) %>% mutate(contrast = label)
}
gsea_F<- run_gsea(ranks_F_e, "F Ang-II vs Saline", msig_entrez)
gsea_M<- run_gsea(ranks_M_e, "M Ang-II vs Saline", msig_entrez)
gsea_Int<- run_gsea(ranks_Int_e, "Interaction (M & F)", msig_entrez)
gsea_Bas<- run_gsea(ranks_Bas_e, "Baseline (M vs F)", msig_entrez)
gsea_all<- bind_rows(gsea_F, gsea_M, gsea_Int, gsea_Bas)
length(ranks_F_e)
## [1] 22219
sum(duplicated(names(ranks_F_e)))
## [1] 0
summary(ranks_F_e)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5.4778 -0.5577 0.2257 0.2475 1.0306 6.2055
Context:
4 contrasts And their interpretation
Baseline (M vs F) This is Sex_M_vs_F, so its Male − Female Red (NES > 0): pathway genes are more highly expressed in males than females. Blue (NES < 0): pathway genes are lower in males, so higher in females.
F Ang-II vs Saline This is Treatment_Ang.II_vs_Saline with F as
baseline sex = (Ang-II females) − (Saline
females) Red: pathway enriched in Ang-II females (treatment turned it
up). Blue: pathway lower in Ang-II females,so relatively higher in
Saline females.
3.M Ang-II vs Saline This IS the main + interaction = (Ang-II males) − (Saline males) Red: pathway up in Ang-II males. Blue: pathway down in Ang-II males = higher in Saline males.
alpha<- 0.05
top_k<- gsea_all %>%
dplyr::filter(!is.na(padj), padj <= alpha) %>%
dplyr::group_by(contrast) %>%
dplyr::arrange(padj, dplyr::desc(abs(NES)), pathway, .by_group = TRUE) %>%
dplyr::slice_head(n = 10) %>%
dplyr::ungroup() %>%
dplyr::mutate(
path = gsub("^HALLMARK_", "", pathway),
path = gsub("_", " ", path),
id = paste(contrast, path, sep = "|")
) %>%
dplyr::group_by(contrast) %>%
dplyr::mutate(id = factor(id, levels = rev(unique(id)))) %>%
dplyr::ungroup()
wrap_lab<- function(x, width = 32) {
vapply(x, function(s) paste(strwrap(s, width = width), collapse = "\n"), character(1))
}
ggplot(top_k, aes(x = id, y = NES, fill = NES > 0)) +
geom_col() +coord_flip() +
facet_wrap(~ contrast, scales = "free_y") +
scale_x_discrete(labels = function(x) wrap_lab(sub("^[^|]*\\|", "", x), width = 32)) +
scale_fill_manual(
values = c("TRUE" = "firebrick", "FALSE" = "steelblue"),
name = "Direction",
labels = c("FALSE" = "Enriched in genes lower in this contrast",
"TRUE" = "Enriched in genes higher in this contrast")) +
labs(
title = sprintf("Top 10 Hallmark pathways per contrast (FDR <=0.05)", alpha),
x = NULL, y = "Normalised Enrichment Score (NES)") +
theme_minimal(base_size = 11) +
theme(legend.position = "bottom")
# GO: BP pathways (msigdbr), grouped by gs_name
msig_entrez<- msigdbr(species="Mus musculus", category="C5", subcategory = "GO:BP")%>%
distinct(gs_name, entrez_gene) %>%
group_by(gs_name) %>%
summarise(genes = list(unique(entrez_gene)), .groups = "drop") %>%
tibble::deframe()
# overlap
ov_Fe<- sapply(msig_entrez, function(g) sum(names(ranks_F_e) %in% g)); summary(ov_Fe)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 8.0 18.0 73.3 54.0 1825.0
# GSEA
run_gsea<- function(ranks, label, pathways) {
fgsea::fgseaMultilevel(pathways = pathways, stats = ranks, minSize = 10, maxSize = 2000) %>% arrange(padj) %>% mutate(contrast = label)
}
gsea_F<- run_gsea(ranks_F_e, "F Ang-II vs Saline", msig_entrez)
gsea_M<- run_gsea(ranks_M_e, "M Ang-II vs Saline", msig_entrez)
gsea_Int<- run_gsea(ranks_Int_e, "Interaction (M & F)", msig_entrez)
gsea_Bas<- run_gsea(ranks_Bas_e, "Baseline (M vs F)", msig_entrez)
gsea_all<- bind_rows(gsea_F, gsea_M, gsea_Int, gsea_Bas)
length(ranks_F_e)
## [1] 22219
sum(duplicated(names(ranks_F_e)))
## [1] 0
summary(ranks_F_e)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5.4778 -0.5577 0.2257 0.2475 1.0306 6.2055
top_k<- gsea_all %>%
dplyr::filter(!is.na(padj), padj <= alpha) %>%
dplyr::group_by(contrast) %>%
dplyr::arrange(padj, dplyr::desc(abs(NES)), pathway, .by_group = TRUE) %>%
dplyr::slice_head(n = 10) %>%
dplyr::ungroup() %>%
dplyr::mutate(
path_label = gsub("^GOBP_", "", pathway),
path_label = gsub("_", " ", path_label),
id = paste(contrast, path_label, sep = "|")) %>%
dplyr::group_by(contrast) %>%
dplyr::mutate(id = factor(id, levels = rev(unique(id)))) %>%
dplyr::ungroup()
ggplot(top_k, aes(x = id, y = NES, fill = NES > 0)) +
geom_col() +
coord_flip() +
facet_wrap(~ contrast, scales = "free_y") +
scale_x_discrete(labels = function(x) wrap_lab(sub("^[^|]*\\|", "", x), width = 32)) +
scale_fill_manual(
values = c("TRUE" = "firebrick", "FALSE" = "steelblue"),
name = "Direction",
labels = c("FALSE" = "Enriched among genes lower in this contrast",
"TRUE" = "Enriched among genes higher in this contrast")) +
labs(title = sprintf("Top 10 GO:BP pathways per contrast (FDR <=0.05)", alpha),
subtitle = NULL,
x = NULL, y = "Normalised Enrichment Score (NES)") +
theme_minimal(base_size = 11) +
theme(
legend.position = "bottom",
strip.text = element_text(face = "bold"),
plot.title = element_text(face = "bold"))
deseq_to_prescore <- function(res_obj) {
as.data.frame(res_obj) %>%
rownames_to_column("ensgene") %>%
mutate(ensgene = sub("\\.\\d+$", "", ensgene)) %>%
filter(!is.na(stat)) %>%
dplyr::select(ensgene, score = stat) %>%
group_by(ensgene) %>%
slice_max(order_by = abs(score), n = 1) %>%
ungroup() %>%
column_to_rownames("ensgene")
}
x_F<- deseq_to_prescore(res_F_raw)
x_M <- deseq_to_prescore(res_M_raw)
x_Int<- deseq_to_prescore(res_int_raw)
x_Bas<- deseq_to_prescore(res_bsl_raw)
geneTable<- ann %>%
filter(!is.na(symbol), symbol != "") %>%
dplyr::select(gene = symbol, ensgene)
# Hallmark
msig_hall<- msigdbr(
species= "Mus musculus",
category= "H") %>%
dplyr::select(gs_name, gene_symbol) %>%
distinct() %>%
group_by(gs_name) %>%
summarise(genes = list(gene_symbol), .groups = "drop") %>%
tibble::deframe()
# GO:BP
msig_gobp<- msigdbr(
species= "Mus musculus",
category= "C5",
subcategory= "GO:BP") %>%
dplyr::select(gs_name, gene_symbol) %>%
distinct() %>%
group_by(gs_name) %>%
summarise(genes = list(gene_symbol), .groups = "drop") %>%
tibble::deframe()
# run mitch for one contrast
run_mitch_one<- function(x_mat, genesets, label) {
mobj <- mitch_import(
x= x_mat,
DEtype= "prescored",
geneTable= geneTable)
mres<- mitch_calc(
x = mobj,
genesets= genesets,
minsetsize= 5,
cores= CORES,
priority= "effect")
out<- mres$enrichment_result %>%
mutate(
contrast= label,
set_clean= gsub("_", " ", gsub("^HALLMARK_|^GOBP_", "", set)))
return(out)
}
#4 contrasts (hallmark)
hall_F<- run_mitch_one(x_F, msig_hall, "F Ang-II vs Saline")
hall_M<- run_mitch_one(x_M, msig_hall, "M Ang-II vs Saline")
hall_Int<- run_mitch_one(x_Int, msig_hall, "Interaction (M vs F)")
hall_Bas<- run_mitch_one(x_Bas, msig_hall, "Baseline M vs F")
hall_all<- bind_rows(hall_F, hall_M, hall_Int, hall_Bas)
#sig + top 10 per contrast
hall_top10<- hall_all %>%
filter(!is.na(p.adjustANOVA), p.adjustANOVA < 0.05, setSize >= 5) %>%
arrange(contrast, p.adjustANOVA, desc(abs(s.dist))) %>%
group_by(contrast) %>%
slice_head(n = 10) %>%
ungroup()
hall_top10
## # A tibble: 40 × 7
## set setSize pANOVA s.dist p.adjustANOVA contrast set_clean
## <chr> <int> <dbl> <dbl> <dbl> <chr> <chr>
## 1 HALLMARK_OXIDATIVE_… 185 2.26e-52 0.647 1.13e-50 Baselin… OXIDATIV…
## 2 HALLMARK_ADIPOGENES… 197 2.49e-36 0.519 6.24e-35 Baselin… ADIPOGEN…
## 3 HALLMARK_HEME_METAB… 193 1.75e-23 0.416 2.42e-22 Baselin… HEME MET…
## 4 HALLMARK_MTORC1_SIG… 197 1.93e-23 0.411 2.42e-22 Baselin… MTORC1 S…
## 5 HALLMARK_GLYCOLYSIS 194 1.63e-22 0.405 1.63e-21 Baselin… GLYCOLYS…
## 6 HALLMARK_FATTY_ACID… 151 6.94e-20 0.429 5.79e-19 Baselin… FATTY AC…
## 7 HALLMARK_P53_PATHWAY 196 1.13e-19 0.375 8.08e-19 Baselin… P53 PATH…
## 8 HALLMARK_MYOGENESIS 197 1.69e-18 0.362 1.06e-17 Baselin… MYOGENES…
## 9 HALLMARK_DNA_REPAIR 149 3.10e-17 0.400 1.72e-16 Baselin… DNA REPA…
## 10 HALLMARK_TNFA_SIGNA… 198 8.51e-17 0.342 4.25e-16 Baselin… TNFA SIG…
## # ℹ 30 more rows
#4 contrasts(GO:BP)
gobp_F<- run_mitch_one(x_F, msig_gobp, "F Ang-II vs Saline")
gobp_M<- run_mitch_one(x_M, msig_gobp, "M Ang-II vs Saline")
gobp_Int<- run_mitch_one(x_Int, msig_gobp, "Interaction (M vs F)")
gobp_Bas<- run_mitch_one(x_Bas, msig_gobp, "Baseline M vs F")
gobp_all<- bind_rows(gobp_F, gobp_M, gobp_Int, gobp_Bas)
gobp_top10<- gobp_all %>%
filter(!is.na(p.adjustANOVA), p.adjustANOVA < 0.05, setSize >= 5) %>%
arrange(contrast, p.adjustANOVA, desc(abs(s.dist))) %>%
group_by(contrast) %>%
slice_head(n = 10) %>%
ungroup()
gobp_top10
## # A tibble: 40 × 7
## set setSize pANOVA s.dist p.adjustANOVA contrast set_clean
## <chr> <int> <dbl> <dbl> <dbl> <chr> <chr>
## 1 GOBP_PHOSPHORUS_ME… 1769 1.49e-120 0.327 1.09e-116 Baselin… PHOSPHOR…
## 2 GOBP_SMALL_MOLECUL… 1601 6.18e-104 0.318 2.26e-100 Baselin… SMALL MO…
## 3 GOBP_VESICLE_MEDIA… 1496 1.42e-102 0.326 3.46e- 99 Baselin… VESICLE …
## 4 GOBP_ESTABLISHMENT… 1631 8.05e-101 0.310 1.47e- 97 Baselin… ESTABLIS…
## 5 GOBP_PROTEIN_CONTA… 1710 6.46e- 94 0.292 9.44e- 91 Baselin… PROTEIN …
## 6 GOBP_REGULATION_OF… 1433 4.20e- 93 0.317 5.13e- 90 Baselin… REGULATI…
## 7 GOBP_NITROGEN_COMP… 1783 3.51e- 90 0.281 3.67e- 87 Baselin… NITROGEN…
## 8 GOBP_APOPTOTIC_PRO… 1764 2.72e- 88 0.279 2.49e- 85 Baselin… APOPTOTI…
## 9 GOBP_MACROMOLECULE… 1397 3.11e- 80 0.297 2.53e- 77 Baselin… MACROMOL…
## 10 GOBP_POSITIVE_REGU… 1704 4.37e- 80 0.270 3.20e- 77 Baselin… POSITIVE…
## # ℹ 30 more rows
# Hallmark pathways
hall_top10$contrast<- factor(
hall_top10$contrast,
levels= c(
"F Ang-II vs Saline",
"M Ang-II vs Saline",
"Interaction (M vs F)",
"Baseline M vs F"
)
)
ggplot(hall_top10,
aes(x = reorder(set_clean, s.dist), y = s.dist, fill = s.dist > 0)) +
geom_col() +
coord_flip() +
facet_wrap(~ contrast, scales = "free_y") +
scale_fill_manual(
values= c("TRUE" = "firebrick", "FALSE" = "steelblue"),
name= "Direction",
labels= c("FALSE" = "Enriched among genes lower in this contrast", "TRUE" = "Enriched among genes higher in this contrast")) +
labs(
title = "mitch: Hallmark pathways (top 10 per contrast)",
x = NULL,
y = "mitch s.dist") +
theme_minimal(base_size = 11) +
theme(
legend.position = "bottom",
strip.text = element_text(face = "bold"))
# GO:BP pathways
gobp_top10$contrast<- factor(
gobp_top10$contrast,
levels = c(
"F Ang-II vs Saline",
"M Ang-II vs Saline",
"Interaction (M & F)",
"Baseline M vs F"
)
)
ggplot(gobp_top10,
aes(x = reorder(set_clean, s.dist), y = s.dist, fill = s.dist > 0)) +
geom_col() +
coord_flip() +
facet_wrap(~ contrast, scales = "free_y") +
scale_fill_manual(
values = c("TRUE" = "firebrick", "FALSE" = "steelblue"),
name = "Direction",
labels = c("FALSE" = "Enriched among genes lower in this contrast", "TRUE" = "Enriched among genes higher in this contrast")) +
labs(
title = "mitch: GO:BP pathways (top 10 per contrast)",
x = NULL,
y = "mitch s.dist") +
theme_minimal(base_size = 11) +
theme(
legend.position = "bottom",
strip.text = element_text(face = "bold"))
sessionInfo()
## R version 4.4.3 (2025-02-28)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 20.04.6 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
##
## locale:
## [1] LC_CTYPE=en_AU.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_AU.UTF-8 LC_COLLATE=en_AU.UTF-8
## [5] LC_MONETARY=en_AU.UTF-8 LC_MESSAGES=en_AU.UTF-8
## [7] LC_PAPER=en_AU.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_AU.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] fgsea_1.32.4
## [2] org.Mm.eg.db_3.20.0
## [3] rtracklayer_1.66.0
## [4] mitch_1.18.4
## [5] msigdbr_25.1.1
## [6] pheatmap_1.0.13
## [7] plotly_4.11.0
## [8] e1071_1.7-16
## [9] RColorBrewer_1.1-3
## [10] magrittr_2.0.3
## [11] lubridate_1.9.4
## [12] forcats_1.0.1
## [13] stringr_1.5.1
## [14] dplyr_1.1.4
## [15] readr_2.1.5
## [16] tidyr_1.3.1
## [17] tibble_3.2.1
## [18] tidyverse_2.0.0
## [19] readxl_1.4.5
## [20] AnnotationDbi_1.68.0
## [21] purrr_1.1.0
## [22] metafor_4.8-0
## [23] numDeriv_2016.8-1.1
## [24] metadat_1.4-0
## [25] Matrix_1.7-4
## [26] png_0.1-8
## [27] gridExtra_2.3
## [28] missMethyl_1.40.3
## [29] IlluminaHumanMethylationEPICv2anno.20a1.hg38_1.0.0
## [30] IlluminaHumanMethylationEPICanno.ilm10b4.hg19_0.6.0
## [31] IlluminaHumanMethylation450kanno.ilmn12.hg19_0.6.1
## [32] minfi_1.52.1
## [33] bumphunter_1.48.0
## [34] locfit_1.5-9.12
## [35] iterators_1.0.14
## [36] foreach_1.5.2
## [37] Biostrings_2.74.1
## [38] XVector_0.46.0
## [39] beeswarm_0.4.0
## [40] kableExtra_1.4.0
## [41] tictoc_1.2.1
## [42] HGNChelper_0.8.15
## [43] DESeq2_1.46.0
## [44] SummarizedExperiment_1.36.0
## [45] Biobase_2.66.0
## [46] MatrixGenerics_1.18.1
## [47] matrixStats_1.5.0
## [48] GenomicRanges_1.58.0
## [49] GenomeInfoDb_1.42.3
## [50] IRanges_2.40.1
## [51] S4Vectors_0.44.0
## [52] BiocGenerics_0.52.0
## [53] eulerr_7.0.4
## [54] apeglm_1.28.0
## [55] tximport_1.34.0
## [56] annotables_0.2.0
## [57] biomaRt_2.62.1
## [58] limma_3.62.2
## [59] EnhancedVolcano_1.24.0
## [60] ggrepel_0.9.6
## [61] ggplot2_4.0.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-9 httr_1.4.7
## [3] tools_4.4.3 doRNG_1.8.6.2
## [5] utf8_1.2.4 R6_2.6.1
## [7] HDF5Array_1.34.0 lazyeval_0.2.2
## [9] rhdf5filters_1.18.1 withr_3.0.2
## [11] prettyunits_1.2.0 GGally_2.4.0
## [13] base64_2.0.2 preprocessCore_1.68.0
## [15] cli_3.6.5 textshaping_1.0.4
## [17] labeling_0.4.3 sass_0.4.10
## [19] SQUAREM_2021.1 mvtnorm_1.3-3
## [21] S7_0.2.0 genefilter_1.88.0
## [23] proxy_0.4-27 askpass_1.2.1
## [25] mixsqp_0.3-54 Rsamtools_2.22.0
## [27] systemfonts_1.3.1 siggenes_1.80.0
## [29] illuminaio_0.48.0 svglite_2.2.2
## [31] rentrez_1.2.4 dichromat_2.0-0.1
## [33] scrime_1.3.5 invgamma_1.2
## [35] bbmle_1.0.25.1 rstudioapi_0.17.1
## [37] RSQLite_2.4.3 generics_0.1.3
## [39] BiocIO_1.16.0 vroom_1.6.5
## [41] gtools_3.9.5 abind_1.4-8
## [43] lifecycle_1.0.4 yaml_2.3.10
## [45] mathjaxr_1.8-0 gplots_3.2.0
## [47] rhdf5_2.50.2 SparseArray_1.6.2
## [49] BiocFileCache_2.14.0 grid_4.4.3
## [51] blob_1.2.4 promises_1.4.0
## [53] crayon_1.5.3 bdsmatrix_1.3-7
## [55] lattice_0.22-5 cowplot_1.2.0
## [57] echarts4r_0.4.6 GenomicFeatures_1.58.0
## [59] annotate_1.84.0 KEGGREST_1.46.0
## [61] pillar_1.11.1 knitr_1.50
## [63] beanplot_1.3.1 rjson_0.2.23
## [65] codetools_0.2-19 fastmatch_1.1-6
## [67] glue_1.8.0 data.table_1.17.8
## [69] vctrs_0.6.5 cellranger_1.1.0
## [71] gtable_0.3.6 assertthat_0.2.1
## [73] emdbook_1.3.14 cachem_1.1.0
## [75] xfun_0.54 mime_0.13
## [77] S4Arrays_1.6.0 coda_0.19-4.1
## [79] survival_3.8-3 statmod_1.5.0
## [81] nlme_3.1-168 bit64_4.6.0-1
## [83] progress_1.2.3 filelock_1.0.3
## [85] bslib_0.9.0 nor1mix_1.3-3
## [87] irlba_2.3.5.1 KernSmooth_2.23-26
## [89] otel_0.2.0 splitstackshape_1.4.8
## [91] DBI_1.2.3 tidyselect_1.2.1
## [93] bit_4.6.0 compiler_4.4.3
## [95] curl_7.0.0 httr2_1.2.1
## [97] xml2_1.4.1 DelayedArray_0.32.0
## [99] caTools_1.18.3 scales_1.4.0
## [101] quadprog_1.5-8 rappdirs_0.3.3
## [103] digest_0.6.37 rmarkdown_2.30
## [105] GEOquery_2.74.0 htmltools_0.5.8.1
## [107] pkgconfig_2.0.3 sparseMatrixStats_1.18.0
## [109] dbplyr_2.5.1 fastmap_1.2.0
## [111] rlang_1.1.6 htmlwidgets_1.6.4
## [113] UCSC.utils_1.2.0 shiny_1.11.1
## [115] DelayedMatrixStats_1.28.1 farver_2.1.2
## [117] jquerylib_0.1.4 jsonlite_2.0.0
## [119] BiocParallel_1.40.2 mclust_6.1.1
## [121] RCurl_1.98-1.17 GenomeInfoDbData_1.2.13
## [123] Rhdf5lib_1.28.0 Rcpp_1.1.0
## [125] babelgene_22.9 stringi_1.8.7
## [127] zlibbioc_1.52.0 MASS_7.3-65
## [129] plyr_1.8.9 org.Hs.eg.db_3.20.0
## [131] ggstats_0.11.0 splines_4.4.3
## [133] multtest_2.62.0 hms_1.1.3
## [135] rngtools_1.5.2 reshape2_1.4.4
## [137] XML_3.99-0.19 evaluate_1.0.5
## [139] tzdb_0.4.0 httpuv_1.6.16
## [141] openssl_2.3.4 reshape_0.8.10
## [143] ashr_2.2-67 xtable_1.8-4
## [145] restfulr_0.0.16 later_1.4.4
## [147] viridisLite_0.4.2 class_7.3-23
## [149] truncnorm_1.0-9 memoise_2.0.1
## [151] GenomicAlignments_1.42.0 timechange_0.3.0